This document contains the Traffic Signs Classifier lab using the LeNet architecture. The solution is implemented with the TensorFlow framework.
import matplotlib.pyplot as plt
import pandas as pd
from random import randint
import numpy as np
import pickle
import cv2
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.contrib.layers import flatten
import os
import matplotlib.image as mpimg
# Class-id -> sign-name lookup table: rows of (id, name) from signnames.csv.
sign_names = pd.read_csv('./signnames.csv').values
def get_sign_name(id):
    """Return the human-readable sign name for class *id*.

    Scans the rows loaded from signnames.csv; yields None when the id is
    not present (matching the original implicit-None behavior).
    """
    names = (row[1] for row in sign_names if row[0] == id)
    return next(names, None)
%matplotlib inline
def print_examples_from_group(images, ids, grayscale = False):
    """Display a row of randomly sampled example images for every sign class.

    images: array of images, indexed in parallel with `ids`.
    ids: integer class label for each image.
    grayscale: render with the gray colormap when True.
    """
    # Number of thumbnails drawn per class.
    line_length = 17
    for sign in sign_names:
        lane = plt.figure(figsize = (line_length, 1))
        lane.subplots_adjust(hspace = 0, wspace = 0)
        print("Id: {0}, name: {1}".format(sign[0], sign[1]))
        # Indices of all images belonging to this class.
        sign_ids = np.where(ids==sign[0])[0]
        for i in range(line_length):
            # NOTE(review): raises if a class has zero images
            # (sign_ids.size == 0), and samples with replacement —
            # acceptable for a quick preview.
            index = randint(0, sign_ids.size - 1)
            image = images[sign_ids[index]]
            a = lane.add_subplot(1, line_length, i + 1, xticks=[], yticks=[])
            if (grayscale):
                a.imshow(image.squeeze(), cmap = "gray")
            else:
                a.imshow(image.squeeze())
        plt.show()
First of all, let's load and preview the data.
# File names of the pickled dataset splits.
training_file = "train.p"
validation_file = "valid.p"
testing_file = "test.p"
# NOTE(review): pickle.load executes arbitrary code from the file — only
# load pickles from a trusted source.
with open(training_file, mode='rb') as f:
    train = pickle.load(f)
with open(validation_file, mode='rb') as f:
    valid = pickle.load(f)
with open(testing_file, mode='rb') as f:
    test = pickle.load(f)
# Each pickle is a dict: 'features' holds the images, 'labels' the class ids.
X_train_init, y_train = train['features'], train['labels']
X_valid_init, y_valid = valid['features'], valid['labels']
X_test_init, y_test = test['features'], test['labels']
# Sanity check: one label per image in every split.
assert(len(X_train_init) == len(y_train))
assert(len(X_valid_init) == len(y_valid))
assert(len(X_test_init) == len(y_test))
print()
print("Image Shape: {}".format(X_train_init[0].shape))
print()
print("Training Set: {} samples".format(len(X_train_init)))
print("Validation Set: {} samples".format(len(X_valid_init)))
print("Test Set: {} samples".format(len(X_test_init)))
Show the number of images for each traffic sign in the training set. It is important to understand whether there are enough training images for a sign in case of bad model performance.
# Plot the number of training examples per traffic-sign class.
# Fix: the original used np.histogram(y_train, bins=42) although there are
# 43 classes, and labelled the bars with the float bin *edges* (43 edges for
# 42 bars), mislabelling every class. np.bincount over the integer labels
# gives exactly one count per class id.
class_counts = np.bincount(y_train, minlength=43)
plt.figure(figsize=(18, 15))
x_pos = range(len(class_counts))
plt.barh(x_pos, class_counts)
labs = [get_sign_name(j) for j in x_pos]
plt.yticks(x_pos, labs)
plt.show()
Show preview of the images
# Preview randomly sampled raw training examples for every class.
print_examples_from_group(X_train_init, y_train)
After previewing the images we can see that they are taken in different conditions (day, night, snow, with and without light reflection). On some images it is even hard to recognise the sign without preprocessing. If we train the model on these images without any preprocessing, the model will give us only around 60% accuracy. So, some preprocessing should be done for them. I tried several methods and came up with the following preprocessing strategy:
There is also space for image augmentation: image rotating and transformation, but it should be performed in some intelligent way and not for all images — what will improve one image can "kill" another. So, there should be some image preprocessing, for example, recognition of sign shape and transforming it to normal. We can also try to use information of where on the image is the sign and how far is it, so we could know what transformation should be done to make shape normal. So far I did not apply any image augmentation because images in the test dataset are very different and different types of augmentation should be applied to different images.
Also, cropping of the images to sign size could help the network to learn.
def grayscale_image(img):
    """Convert an RGB image to a single-channel luma image.

    Takes the Y (luma) channel of the YCrCb conversion, which preserves
    perceived brightness better than a naive channel average.

    img: RGB image array of shape (H, W, 3).
    Returns an array of shape (H, W, 1).
    """
    ycrcb = cv2.cvtColor(img, cv2.COLOR_RGB2YCrCb)
    # Fix: slicing keeps the channel axis directly. The original used
    # np.resize(gs[:, :, 0], (32, 32, 1)), which silently truncates or
    # repeats data for any input that is not exactly 32x32; the slice is
    # identical for 32x32 inputs and correct for every other size.
    return ycrcb[:, :, :1]
def equalise_histogram(img):
    """Histogram-equalise a uint8 image via its cumulative distribution.

    Builds a 256-entry lookup table that stretches the occupied intensity
    range to [0, 255] and maps every pixel through it. Improves contrast
    on dark or washed-out sign photos.
    """
    counts, _ = np.histogram(img.flatten(), 256, [0, 256])
    cumulative = counts.cumsum()
    # Mask empty bins so they do not distort the min/max of the CDF.
    occupied = np.ma.masked_equal(cumulative, 0)
    stretched = (occupied - occupied.min()) * 255 / (occupied.max() - occupied.min())
    lut = np.ma.filled(stretched, 0).astype('uint8')
    return lut[img]
def normalise_image(img):
    """Scale uint8 pixel values from [0, 255] to roughly [-0.5, 0.5)."""
    scaled = img / 256
    return scaled - 0.5
def preprocess_images(imgs):
    """Apply the full preprocessing chain to every image in *imgs*.

    Pipeline: RGB -> grayscale (luma) -> histogram equalisation ->
    normalisation to [-0.5, 0.5). Returns a list of processed images.
    """
    return [
        normalise_image(equalise_histogram(grayscale_image(img)))
        for img in imgs
    ]
# Preprocess every split once, up front; the raw *_init arrays are kept for
# visual comparison.
X_train = preprocess_images(X_train_init)
X_valid = preprocess_images(X_valid_init)
X_test = preprocess_images(X_test_init)
Preview images after preprocessing
# Preview the same per-class sampling after preprocessing (grayscale render).
res = print_examples_from_group(X_train, y_train, grayscale = True)
# Number of full passes over the training set.
EPOCHS = 50
# Number of images per gradient-descent step.
BATCH_SIZE = 128
Implement neural network based on the LeNet-5 neural network architecture.
The LeNet architecture accepts a 32x32xC image as input, where C is the number of color channels (1 in my case because of grayscaling).
def LeNet(x):
    """Build the LeNet-style classifier graph and return its logits.

    x: float32 tensor of shape (batch, 32, 32, 1) — preprocessed grayscale
       images.
    Returns a (batch, 43) logits tensor, one score per sign class.
    """
    # Arguments for tf.truncated_normal: every weight tensor is initialised
    # around mean `mu` with spread `sigma`.
    mu = 0
    sigma = 0.1
    # NOTE(review): keep_prob is a Python constant baked into the graph, so
    # dropout also stays active during validation/test evaluation — consider
    # a placeholder fed 1.0 at inference. TODO confirm this is intended.
    keep_prob = 0.7
    # Layer 1: convolutional. Input 32x32x1, 5x5 kernels -> output 28x28x43.
    conv1_W = tf.Variable(tf.truncated_normal(shape=(5, 5, 1, 43), mean = mu, stddev = sigma))
    conv1_b = tf.Variable(tf.zeros(43))
    conv1 = tf.nn.conv2d(x, conv1_W, strides=[1, 1, 1, 1], padding='VALID') + conv1_b
    # Activation.
    conv1 = tf.nn.relu(conv1)
    # Average pooling. 28x28x43 -> 14x14x43.
    conv1 = tf.nn.avg_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
    # Layer 2: convolutional. Input 14x14x43, 5x5 kernels -> output 10x10x80.
    conv2_W = tf.Variable(tf.truncated_normal(shape=(5, 5, 43, 80), mean = mu, stddev = sigma))
    conv2_b = tf.Variable(tf.zeros(80))
    conv2 = tf.nn.conv2d(conv1, conv2_W, strides=[1, 1, 1, 1], padding='VALID') + conv2_b
    # Activation.
    conv2 = tf.nn.relu(conv2)
    # Average pooling. 10x10x80 -> 5x5x80.
    conv2 = tf.nn.avg_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
    # Layer 3: convolutional. Input 5x5x80, 2x2 kernels -> output 4x4x200.
    conv3_W = tf.Variable(tf.truncated_normal(shape=(2, 2, 80, 200), mean = mu, stddev = sigma))
    conv3_b = tf.Variable(tf.zeros(200))
    conv3 = tf.nn.conv2d(conv2, conv3_W, strides=[1, 1, 1, 1], padding='VALID') + conv3_b
    # Activation.
    conv3 = tf.nn.relu(conv3)
    # Average pooling. 4x4x200 -> 2x2x200.
    conv3 = tf.nn.avg_pool(conv3, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
    # (A 4th convolutional layer was tried and removed: it did not improve
    # accuracy.)
    # Flatten. 2x2x200 -> 800.
    fc0 = flatten(conv3)
    # Layer 4: fully connected. 800 -> 120.
    fc1_W = tf.Variable(tf.truncated_normal(shape=(800, 120), mean = mu, stddev = sigma))
    fc1_b = tf.Variable(tf.zeros(120))
    fc1 = tf.matmul(fc0, fc1_W) + fc1_b
    # Activation.
    fc1 = tf.nn.relu(fc1)
    # Dropout (see keep_prob note above).
    fc1 = tf.nn.dropout(fc1, keep_prob)
    # Layer 5: fully connected. 120 -> 84.
    fc2_W = tf.Variable(tf.truncated_normal(shape=(120, 84), mean = mu, stddev = sigma))
    fc2_b = tf.Variable(tf.zeros(84))
    fc2 = tf.matmul(fc1, fc2_W) + fc2_b
    # Activation.
    fc2 = tf.nn.relu(fc2)
    # Dropout.
    fc2 = tf.nn.dropout(fc2, keep_prob)
    # Layer 6: fully connected. 84 -> 43 (one logit per sign class).
    fc3_W = tf.Variable(tf.truncated_normal(shape=(84, 43), mean = mu, stddev = sigma))
    fc3_b = tf.Variable(tf.zeros(43))
    logits = tf.matmul(fc2, fc3_W) + fc3_b
    return logits
Train LeNet to classify traffic signs data.
x is a placeholder for a batch of input images.
y is a placeholder for a batch of output labels.
# x: a batch of preprocessed 32x32x1 images.
x = tf.placeholder(tf.float32, (None, 32, 32, 1))
# y: the batch's integer class labels.
y = tf.placeholder(tf.int32, (None))
# One-hot encode the labels for the 43 sign classes.
one_hot_y = tf.one_hot(y, 43)
For the training pipeline I used 50 epochs and a batch size of 128. Increasing the batch size to 256 did not change the accuracy of the model, so I decided to stop at this value. I have also trained the model with 35 epochs, but after 30 epochs accuracy is normally more or less the same.
I used the ReLU function for activation, as suggested in the lectures.
I also chose the Adam optimizer, following the suggestion in the LeNet lab.
The first architecture I used was the LeNet implementation: it contained 2 convolutional layers and gave me around 86% accuracy. Increasing it to 3 convolutional layers and changing the size of each layer increased accuracy to around 92%. Increasing to 4 layers did not change accuracy, so I decided to remove the 4th layer.
I tuned many layer parameters, for example input and output shapes, and came up with the parameters that satisfy me best.
I also tried to take average pooling instead of max pooling in my model. I did not see significant effect on the results.
I also added dropout on the non-convolutional layers with a keep probability of 0.7. It decreased accuracy a bit on the validation and test sets, but increased accuracy on my own images. The current version of the notebook contains dropout.
# Learning rate for the Adam optimizer.
rate = 0.001
logits = LeNet(x)
# Softmax cross-entropy between the one-hot labels and the network's logits.
# NOTE(review): this op is deprecated in later TF 1.x in favour of the _v2
# variant; behavior is the same here since the labels are not trainable.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=one_hot_y, logits=logits)
# Mean loss over the batch.
loss_operation = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate = rate)
training_operation = optimizer.minimize(loss_operation)
Evaluate how well the loss and accuracy of the model for a given dataset.
# A prediction is correct when the arg-max logit matches the true class.
correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(one_hot_y, 1))
# Fraction of correct predictions in the batch.
accuracy_operation = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Saver used to checkpoint/restore the trained variables.
saver = tf.train.Saver()
def evaluate(X_data, y_data):
    """Return the model's mean accuracy over a dataset.

    Runs `accuracy_operation` batch by batch in the default session and
    weights each batch's accuracy by its size, so the final short batch
    is accounted for correctly.
    """
    session = tf.get_default_session()
    total = len(X_data)
    weighted_correct = 0
    for start in range(0, total, BATCH_SIZE):
        stop = start + BATCH_SIZE
        images, labels = X_data[start:stop], y_data[start:stop]
        batch_accuracy = session.run(accuracy_operation,
                                     feed_dict={x: images, y: labels})
        weighted_correct += batch_accuracy * len(images)
    return weighted_correct / total
Run the training data through the training pipeline to train the model.
Before each epoch, shuffle the training set.
After each epoch, measure the loss and accuracy of the validation set.
Save the model every time when accuracy is improved.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    num_examples = len(X_train)
    print("Training...")
    print()
    # Best validation accuracy seen so far; the model is checkpointed only
    # when it improves.
    max_val_accuracy = 0
    for i in range(EPOCHS):
        # Reshuffle every epoch so batch composition differs between epochs.
        X_train, y_train = shuffle(X_train, y_train)
        for offset in range(0, num_examples, BATCH_SIZE):
            end = offset + BATCH_SIZE
            batch_x, batch_y = X_train[offset:end], y_train[offset:end]
            sess.run(training_operation, feed_dict={x: batch_x, y: batch_y})
        validation_accuracy = evaluate(X_valid, y_valid)
        print("EPOCH {} ...".format(i+1))
        print("Validation Accuracy = {:.3f}".format(validation_accuracy))
        print()
        # Keep only the best model on disk.
        if validation_accuracy > max_val_accuracy:
            saver.save(sess, './lenet')
            max_val_accuracy = validation_accuracy
Evaluate the performance of the model on the test set.
# Evaluate the best checkpoint on the held-out test set (run once only).
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint('.'))
    test_accuracy = evaluate(X_test, y_test)
    print("Test Accuracy = {:.3f}".format(test_accuracy))
I was able to reach around 98% accuracy on the validation set. The model shows 95.6% accuracy on the test set. These are not the best results and certainly not safe for production usage, so there is room for improvement (I would suggest aiming for near 99.99%).
With dropout I reached around 97% accuracy on the validation set and 95% on test set. It is lower than without dropout, but it showed better results on my test images, so I would suggest to perform improvements of preprocessing and then compare results one more time.
Improvements could be done in 3 basic places:
# Measure test accuracy separately for each of the 43 sign classes to spot
# classes the model handles poorly.
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint('.'))
    results = []
    for i in range(43):
        # Indices of all test images with class id i.
        sign_ids = np.where(y_test==i)[0]
        observed_images = []
        observed_labels = []
        for id in sign_ids:
            observed_images.append(X_test[id])
            observed_labels.append(i)
        test_accuracy = evaluate(observed_images, observed_labels)
        results.append(test_accuracy)
print("Evaluation per group")
plt.figure(figsize=(18, 15))
x_pos = range(43)
plt.barh(x_pos, results)
labs=[get_sign_name(j) for j in x_pos]
plt.yticks(x_pos, labs)
plt.show()
We see that some signs have low recognition accuracy. For some of them this is dangerous, for example "Pedestrians". Possible solutions to fix that:
One more interesting observation is that there is no correlation between the number of images in the training dataset and accuracy. This may be because some images are simply easier to detect while others are harder.
# Load my own test images; each file name encodes the true label ("<id>.<ext>").
plt.close("all")
my_images_raw = []
my_labels = []
for file in os.listdir("my_image_set/"):
    try:
        image = mpimg.imread('my_image_set/' + file)
        # NOTE(review): mpimg.imread returns floats in [0, 1] only for PNG;
        # for JPEG it already returns uint8, so *255 would wrap around —
        # assumes the folder contains PNGs. TODO confirm.
        image = (image * 255).round().astype(np.uint8)
        my_images_raw.append(image)
        my_labels.append(int(file.split(".")[0]))
    except (OSError, ValueError):
        # Fix: narrowed from a bare `except:` so that unreadable files and
        # names without a leading integer label are skipped, while unrelated
        # bugs (KeyboardInterrupt, NameError, ...) still surface.
        print("Bad file " + file)
my_images = preprocess_images(my_images_raw)
def print_images(img, cmap = None):
    """Render every image in *img* side by side in a single one-row figure.

    cmap: optional matplotlib colormap name (e.g. "gray").
    """
    count = len(img)
    figure = plt.figure(figsize = (count, 1))
    figure.subplots_adjust(hspace = 0, wspace = 0)
    for position, picture in enumerate(img):
        axis = figure.add_subplot(1, count, position + 1, xticks=[], yticks=[])
        axis.imshow(picture.squeeze(), cmap = cmap)
    plt.show()
# Show the downloaded images before and after preprocessing.
print("My images test set: ")
print_images(my_images_raw, cmap = "gray")
print("Preprocessed: ")
print_images(my_images, cmap = "gray")
# Evaluate the best checkpoint on my own images.
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint('.'))
    test_accuracy = evaluate(my_images, my_labels)
    print("My images recognition accuracy = {:.3f}".format(test_accuracy))
# Number of most probable classes to display for each image.
TOP_K = 5
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint('.'))
    # top_k over the softmax gives, per image, the TOP_K highest class
    # probabilities (.values) and their class ids (.indices).
    my_images_softmax = sess.run(tf.nn.top_k(tf.nn.softmax(logits), k=TOP_K), feed_dict={x:my_images})
plt.figure(figsize=(18, 25))
for i in range(len(my_images)):
    # Left column: the raw image; right column: its top-5 probability bars.
    plt.subplot(12, 2, 2*i + 1)
    image = my_images_raw[i]
    plt.imshow(image.squeeze(), cmap = "gray")
    plt.axis('off')
    plt.subplot(12, 2, 2*i + 2)
    # NOTE(review): the 12-row grid and the arange(1, 6) tick positions
    # hard-code at most 12 images and TOP_K == 5 — adjust both if either
    # changes.
    plt.barh(np.arange(1, 6, 1), my_images_softmax.values[i, :])
    labs=[get_sign_name(j) for j in my_images_softmax.indices[i]]
    plt.yticks(np.arange(1, 6, 1), labs)
plt.show()
For my images taken from a German driving video the model does not perform very well when run without dropout. There are three signs with bad recognition results:
With dropout, the results on my test set were very good: all signs are recognized correctly, most of them with confidence close to 1.